---
title: "Arizona Department of Environmental Quality's Flow Regime Screening Tools: Analysis"
author: "Matthew Robinson"
date: "2024-08-22"
output: pdf_document
---

# Purpose

For the Data Accessibility section of the manuscript titled "Development of Flow Regime Screening Tools to Differentiate Intermittent Rivers and Ephemeral Streams in Arizona."

The purpose of this code is to select thresholds for ADEQ's Riparian Vegetation Tool and Groundwater Tools. Previously, threshold selection was completed in Excel and only considered narrow windows of threshold values. Revised values for the Riparian Vegetation Tool (in % riparian cover) and the Groundwater Tool (average depth to groundwater in feet) were calculated for both tools for this analysis, which differ (some, not all) from the original values used in the White Papers (2021). Results of the recalculations are used in the imported dataset in this code. 

Using the new values, this code generates a confusion matrix for multiple potential thresholds for each tool. For the Riparian Vegetation Tool the thresholds range from 0-100% riparian cover, and for the Groundwater Tool the thresholds range from 0-100 feet. 

Once the Confusion matrices are created, we pull the stats for each confusion matrix (sensitivity, precision, F1-score, accuracy and the p-value for accuracy).

This code also generates figures used in the manuscript.

## Setup

```{r, message= FALSE, warning= FALSE}

# clear the environment
rm(list = ls())

# read packages
library(dplyr)
library(ggplot2)
library(caret)
library(ggsignif)
```

## Read in datasets

```{r}

setwd("M:\\AZDEQ\\Publications\\ScreeningToolsTestR\\AGU_Materials\\DataAccessibility\\ToFigShare")

tools = read.csv(file = "ADEQ_FlowRegime_ScreeningTools_Publication_Data.csv",header=TRUE, na.strings = "",)

```

## Clean Dataset

```{r}

# Sites selected for publication are indicated by "Y" in the [Y.N] column. 
tools = tools %>% 
  # for confusion matrices we need a TRUTH column (1 or 0, where 1 is intermittent, and 0 is ephemeral)
  mutate(TRUTH = ifelse(Flow.Regime.List == "Intermittent", 1,0))

# Groundwater DF
groundwater = tools %>% 
  dplyr::select(TRUTH, GWT.Recalcs) %>% 
# Remove NA
  filter(!GWT.Recalcs == "NA") %>% 
  # Remove any depth to groundwater values < -10 (decision made by group)
  mutate(GWT.Recalcs = as.numeric(GWT.Recalcs)) %>% 
  filter(!GWT.Recalcs <= -10)

# Riparian Vegetation Cover DF
riparianveg = tools %>% 
  dplyr::select(TRUTH, Riparian.Tool.Recalcs)

```

# Summary Stats & Plots

```{r}
# Riparian

# Summary
rip.summary = riparianveg %>% 
  group_by(TRUTH) %>% 
  summarize(av = mean(Riparian.Tool.Recalcs),
            min = min(Riparian.Tool.Recalcs),
            max = max(Riparian.Tool.Recalcs),
            sd = sd(Riparian.Tool.Recalcs),
            count = n())

# Data for plot
rip.veg.plot = riparianveg %>% 
  mutate(TRUTH = as.factor(TRUTH)) %>% 
  mutate(TRUTH = factor(TRUTH, labels = c("Ephemeral", "Intermittent")))


# Plot 
  rip.plot= ggplot(rip.veg.plot, aes(x = Riparian.Tool.Recalcs, fill = TRUTH)) + 
  geom_histogram(alpha = 0.5, position = "identity", bins = nrow(rip.veg.plot), color = "black") + 
  labs(fill = "Flow regime", color = "Flow regime", y = "Count", x = "Percent riparian corridor vegetation cover") +
 scale_fill_manual(values = c("black", "white")) +
  xlim(0,100) +
  ylim(0,4) +
  theme_classic() + 
  theme(legend.position = c(0.9, 0.9))

  rip.plot
  
```

```{r}
## Groundwater

gw.summary = groundwater %>% 
  group_by(TRUTH) %>% 
  summarize(av = mean(GWT.Recalcs),
            min = min(GWT.Recalcs),
            max = max(GWT.Recalcs),
            sd = sd(GWT.Recalcs),
            count = n())

# Data for plot
groundwater.plot = groundwater %>% 
  mutate(TRUTH = as.factor(TRUTH)) %>% 
  mutate(TRUTH = factor(TRUTH, labels = c("Ephemeral", "Intermittent")))

# Plot
gwt.plot = ggplot(groundwater.plot, aes(x = GWT.Recalcs, fill = TRUTH)) + 
  geom_histogram(alpha = 0.5, position = "identity", bins = nrow(groundwater.plot), color = "black") + 
  labs(fill = "Flow regime", color = "Flow regime", y = "Count", x = "Depth to groundwater (feet)") +
 scale_fill_manual(values = c("black", "white")) +
  xlim(0,500) +
  theme_classic() + 
  theme(legend.position = c(0.9, 0.9))

gwt.plot

```

Snowpack Tool Graphics

```{r}
setwd("M:\\AZDEQ\\Publications\\ScreeningToolsTestR\\AGU_Materials\\DataAccessibility\\ToFigShare")
snow = read.csv("ADEQ_SnowpackTool_AvePercYears_graphic.csv")

names(snow) = c("elev", "percent", "value")

snow = ggplot(snow, aes(x = reorder(elev, value), y = percent))+ 
  geom_bar(stat = "identity", fill = "gray")+
  theme_classic() +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  geom_text(aes(label = percent), vjust = -.5) +
  geom_rect(aes(xmin = 10.55, xmax = 12.45, ymin = 0, ymax = 100), fill = "yellow", color = "yellow", alpha = 0.02) +
  labs(x = "Elevation bands", y = "Percent of years with snowpack")

snow

```


# Make a loop for these confusion matrices to run every threshold.

## Depth to Groundwater Tool Confusion Matrix Loop

```{r}
# Create a results table
groundwater.results = data.frame(threshold = NA, TruePositive = NA, FalseNegative = NA, FalsePositive = NA, TrueNegative = NA,
                  Accuracy = NA, Kappa = NA, AccuracyLower = NA,  AccuracyUpper = NA,   AccuracyNull = NA,  AccuracyPValue = NA,      
                  McnemarPValue = NA, Sensitivity = NA, Specificity = NA, PosPredValue = NA, NegPredValue = NA, Precision = NA, 
                  Recall = NA, F1 = NA, Prevelance = NA, DetectionRate = NA, DetectionPrevalance = NA, BalcancedAccuracy = NA)

# Setup the thresholds we want to test
thresh = seq(0,100,1)

# Begin the loop. 

for (x in thresh){

# Setup the confusion matrix

# Make TRUTH a factor
groundwater$TRUTH = factor(groundwater$TRUTH, levels = c("1", "0"))

# make a string of 1 or 0 for using our new threshold
# iterate through the thresholds with "x"
m_or_r = ifelse(groundwater$GWT.Recalcs < x, "1", "0")

# This is our predicted 1 or 0 based on the threshold as a factor
p_class = factor(m_or_r, levels = levels(groundwater[["TRUTH"]]))

# Run the confusion Matrix
confusionMatrix(p_class, groundwater[["TRUTH"]])


# We can save the confusion matrix as an object so I can pull from it
cm = confusionMatrix(p_class, groundwater[["TRUTH"]])

# populate the DF using "x" as the index to populate the given threshold and row #

groundwater.results[x,1] = x
# I can call pieces of the object out
cm.table = cm$table # this is the confusion matrix
groundwater.results[x,2] = cm.table[1,1] # 3   True Positive   1:1
groundwater.results[x,3] = cm.table[2,1] # 10  False Negative  1:0
groundwater.results[x,4] = cm.table[1,2] # 3   False Positive  0:1
groundwater.results[x,5] = cm.table[2,2] # 13  True Negative   0:0

cm.overall = cm$overall # pulls the accuracy info     7 variables
groundwater.results[x,6] = cm.overall[1] # Accuracy 
groundwater.results[x,7] = cm.overall[2] # Kappa
groundwater.results[x,8] = cm.overall[3] # AccuracyLower
groundwater.results[x,9] = cm.overall[4] # AccuracyUpper
groundwater.results[x,10] = cm.overall[5] # AccuracyNull
groundwater.results[x,11] = cm.overall[6] # AccuracyPValue
groundwater.results[x,12] = cm.overall[7] # McnemarPValue

cm.class = cm$byClass # pulls specificity... etc.  11 variables 
groundwater.results[x,13] = cm.class[1]  # Sensitivity 
groundwater.results[x,14] = cm.class[2]  # Specificity
groundwater.results[x,15] = cm.class[3]  # Pos Pred Value
groundwater.results[x,16] = cm.class[4]  # Neg Pred Value 
groundwater.results[x,17] = cm.class[5]  # Precision
groundwater.results[x,18] = cm.class[6]  # Recall
groundwater.results[x,19] = cm.class[7]  # F1
groundwater.results[x,20] = cm.class[8]  # Prevalence 
groundwater.results[x,21] = cm.class[9]  # Detection Rate 
groundwater.results[x,22] = cm.class[10] # Detection Prevalence
groundwater.results[x,23] = cm.class[11] # Balanced Accuracy 
}

```

## Riparian Vegetation Tool Confusion Matrix Loop

```{r}
# Create a results table
riparian.results = data.frame(threshold = NA, TruePositive = NA, FalseNegative = NA, FalsePositive = NA, TrueNegative = NA,
                  Accuracy = NA, Kappa = NA, AccuracyLower = NA,  AccuracyUpper = NA,   AccuracyNull = NA,  AccuracyPValue = NA,      
                  McnemarPValue = NA, Sensitivity = NA, Specificity = NA, PosPredValue = NA, NegPredValue = NA, Precision = NA, 
                  Recall = NA, F1 = NA, Prevelance = NA, DetectionRate = NA, DetectionPrevalance = NA, BalcancedAccuracy = NA)

# Setup the thresholds we want to test
thresh = seq(0,100,1)

# Begin the loop. 

for (x in thresh){

# Setup the confusion matrix

# Make TRUTH a factor
riparianveg$TRUTH = factor(riparianveg$TRUTH, levels = c("1", "0"))

# make a string of 1 or 0 for using our new threshold
# iterate through the thresholds with "x"
m_or_r = ifelse(riparianveg$Riparian.Tool.Recalcs < x, "0", "1")

# This is our predicted 1 or 0 based on the threshold as a factor
p_class = factor(m_or_r, levels = levels(riparianveg[["TRUTH"]]))

# Run the confusion Matrix
confusionMatrix(p_class, riparianveg[["TRUTH"]])


# We can save the confusion matrix as an object so I can pull from it
cm = confusionMatrix(p_class, riparianveg[["TRUTH"]])

# populate the DF using "x" as the index to populate the given threshold and row #

riparian.results[x,1] = x
# I can call pieces of the object out
cm.table = cm$table # this is the confusion matrix
riparian.results[x,2] = cm.table[1,1] # 3   True Positive   1:1
riparian.results[x,3] = cm.table[2,1] # 10  False Negative  1:0
riparian.results[x,4] = cm.table[1,2] # 3   False Positive  0:1
riparian.results[x,5] = cm.table[2,2] # 13  True Negative   0:0

cm.overall = cm$overall # pulls the accuracy info     7 variables
riparian.results[x,6] = cm.overall[1] # Accuracy 
riparian.results[x,7] = cm.overall[2] # Kappa
riparian.results[x,8] = cm.overall[3] # AccuracyLower
riparian.results[x,9] = cm.overall[4] # AccuracyUpper
riparian.results[x,10] = cm.overall[5] # AccuracyNull
riparian.results[x,11] = cm.overall[6] # AccuracyPValue
riparian.results[x,12] = cm.overall[7] # McnemarPValue

cm.class = cm$byClass # pulls specificity... etc.  11 variables 
riparian.results[x,13] = cm.class[1]  # Sensitivity 
riparian.results[x,14] = cm.class[2]  # Specificity
riparian.results[x,15] = cm.class[3]  # Pos Pred Value
riparian.results[x,16] = cm.class[4]  # Neg Pred Value 
riparian.results[x,17] = cm.class[5]  # Precision
riparian.results[x,18] = cm.class[6]  # Recall
riparian.results[x,19] = cm.class[7]  # F1
riparian.results[x,20] = cm.class[8]  # Prevalence 
riparian.results[x,21] = cm.class[9]  # Detection Rate 
riparian.results[x,22] = cm.class[10] # Detection Prevalence
riparian.results[x,23] = cm.class[11] # Balanced Accuracy 
}

```
